#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512VLINTRIN_H_INCLUDED
#define _AVX512VLINTRIN_H_INCLUDED

#pragma GCC push_options
#pragma GCC target("avx512vl")
#define __DISABLE_AVX512VL__

#pragma GCC push_options
#pragma GCC target("avx512vl,avx512cd")
#define __DISABLE_AVX512VLCD__

#ifdef __DISABLE_AVX512VLCD__
#undef __DISABLE_AVX512VLCD__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VLCD__ */
/* (Elided: inline intrinsic bodies from this part of the header did not
   survive extraction; only their "(unsigned char) __imm" argument casts
   remain.)  */
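/* When the compiler is not optimizing, intrinsics that take immediate
   operands cannot rely on inlining to keep the argument constant, so
   the forms below are macros that forward the immediate directly to
   the builtin.  */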
#define _mm256_permutex_pd(X, M) \
  ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(X), (int)(M), \
    (__v4df)(__m256d)_mm256_undefined_pd (), \
    (__mmask8)-1))

#define _mm256_permutex_epi64(X, I) \
  ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \
    (int)(I), \
    (__v4di)(__m256i)(_mm256_setzero_si256 ()), \
    (__mmask8)-1))

#define _mm256_maskz_permutex_epi64(M, X, I) \
  ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \
    (int)(I), \
    (__v4di)(__m256i)(_mm256_setzero_si256 ()), \
    (__mmask8)(M)))

#define _mm256_mask_permutex_epi64(W, M, X, I) \
  ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \
    (int)(I), \
    (__v4di)(__m256i)(W), \
    (__mmask8)(M)))
#define _mm256_insertf32x4(X, Y, C) \
  ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \
    (__v4sf)(__m128) (Y), (int) (C), \
    (__v8sf)(__m256)_mm256_setzero_ps (), \
    (__mmask8)-1))

#define _mm256_mask_insertf32x4(W, U, X, Y, C) \
  ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \
    (__v4sf)(__m128) (Y), (int) (C), \
    (__v8sf)(__m256)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_insertf32x4(U, X, Y, C) \
  ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \
    (__v4sf)(__m128) (Y), (int) (C), \
    (__v8sf)(__m256)_mm256_setzero_ps (), \
    (__mmask8)(U)))

#define _mm256_inserti32x4(X, Y, C) \
  ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X), \
    (__v4si)(__m128i) (Y), (int) (C), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)-1))

#define _mm256_mask_inserti32x4(W, U, X, Y, C) \
  ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X), \
    (__v4si)(__m128i) (Y), (int) (C), \
    (__v8si)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_inserti32x4(U, X, Y, C) \
  ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X), \
    (__v4si)(__m128i) (Y), (int) (C), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm256_extractf32x4_ps(X, C) \
  ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \
    (int) (C), \
    (__v4sf)(__m128)_mm_setzero_ps (), \
    (__mmask8)-1))

#define _mm256_mask_extractf32x4_ps(W, U, X, C) \
  ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \
    (int) (C), \
    (__v4sf)(__m128)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_extractf32x4_ps(U, X, C) \
  ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \
    (int) (C), \
    (__v4sf)(__m128)_mm_setzero_ps (), \
    (__mmask8)(U)))

#define _mm256_extracti32x4_epi32(X, C) \
  ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X), \
    (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1))

#define _mm256_mask_extracti32x4_epi32(W, U, X, C) \
  ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X), \
    (int) (C), (__v4si)(__m128i)(W), (__mmask8)(U)))

#define _mm256_maskz_extracti32x4_epi32(U, X, C) \
  ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X), \
    (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U)))
#define _mm256_shuffle_i64x2(X, Y, C) \
  ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(C), \
    (__v4di)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)-1))

#define _mm256_mask_shuffle_i64x2(W, U, X, Y, C) \
  ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(C), \
    (__v4di)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_shuffle_i64x2(U, X, Y, C) \
  ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(C), \
    (__v4di)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm256_shuffle_i32x4(X, Y, C) \
  ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(C), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)-1))

#define _mm256_mask_shuffle_i32x4(W, U, X, Y, C) \
  ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(C), \
    (__v8si)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_shuffle_i32x4(U, X, Y, C) \
  ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(C), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm256_shuffle_f64x2(X, Y, C) \
  ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), \
    (__v4df)(__m256d)(Y), (int)(C), \
    (__v4df)(__m256d)_mm256_setzero_pd (), \
    (__mmask8)-1))

#define _mm256_mask_shuffle_f64x2(W, U, X, Y, C) \
  ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), \
    (__v4df)(__m256d)(Y), (int)(C), \
    (__v4df)(__m256d)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_shuffle_f64x2(U, X, Y, C) \
  ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), \
    (__v4df)(__m256d)(Y), (int)(C), \
    (__v4df)(__m256d)_mm256_setzero_pd (), \
    (__mmask8)(U)))

#define _mm256_shuffle_f32x4(X, Y, C) \
  ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), \
    (__v8sf)(__m256)(Y), (int)(C), \
    (__v8sf)(__m256)_mm256_setzero_ps (), \
    (__mmask8)-1))

#define _mm256_mask_shuffle_f32x4(W, U, X, Y, C) \
  ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), \
    (__v8sf)(__m256)(Y), (int)(C), \
    (__v8sf)(__m256)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_shuffle_f32x4(U, X, Y, C) \
  ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), \
    (__v8sf)(__m256)(Y), (int)(C), \
    (__v8sf)(__m256)_mm256_setzero_ps (), \
    (__mmask8)(U)))
#define _mm256_mask_shuffle_pd(W, U, A, B, C) \
  ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), \
    (__v4df)(__m256d)(B), (int)(C), \
    (__v4df)(__m256d)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_shuffle_pd(U, A, B, C) \
  ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), \
    (__v4df)(__m256d)(B), (int)(C), \
    (__v4df)(__m256d)_mm256_setzero_pd (), \
    (__mmask8)(U)))

#define _mm_mask_shuffle_pd(W, U, A, B, C) \
  ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), \
    (__v2df)(__m128d)(B), (int)(C), \
    (__v2df)(__m128d)(W), \
    (__mmask8)(U)))

#define _mm_maskz_shuffle_pd(U, A, B, C) \
  ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), \
    (__v2df)(__m128d)(B), (int)(C), \
    (__v2df)(__m128d)_mm_setzero_pd (), \
    (__mmask8)(U)))

#define _mm256_mask_shuffle_ps(W, U, A, B, C) \
  ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), \
    (__v8sf)(__m256)(B), (int)(C), \
    (__v8sf)(__m256)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_shuffle_ps(U, A, B, C) \
  ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), \
    (__v8sf)(__m256)(B), (int)(C), \
    (__v8sf)(__m256)_mm256_setzero_ps (), \
    (__mmask8)(U)))

#define _mm_mask_shuffle_ps(W, U, A, B, C) \
  ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), \
    (__v4sf)(__m128)(B), (int)(C), \
    (__v4sf)(__m128)(W), \
    (__mmask8)(U)))

#define _mm_maskz_shuffle_ps(U, A, B, C) \
  ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), \
    (__v4sf)(__m128)(B), (int)(C), \
    (__v4sf)(__m128)_mm_setzero_ps (), \
    (__mmask8)(U)))
#define _mm256_fixupimm_pd(X, Y, Z, C) \
  ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), \
    (__v4df)(__m256d)(Y), \
    (__v4di)(__m256i)(Z), (int)(C), \
    (__mmask8)-1))

#define _mm256_mask_fixupimm_pd(X, U, Y, Z, C) \
  ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), \
    (__v4df)(__m256d)(Y), \
    (__v4di)(__m256i)(Z), (int)(C), \
    (__mmask8)(U)))

#define _mm256_maskz_fixupimm_pd(U, X, Y, Z, C) \
  ((__m256d)__builtin_ia32_fixupimmpd256_maskz ((__v4df)(__m256d)(X), \
    (__v4df)(__m256d)(Y), \
    (__v4di)(__m256i)(Z), (int)(C), \
    (__mmask8)(U)))

#define _mm256_fixupimm_ps(X, Y, Z, C) \
  ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), \
    (__v8sf)(__m256)(Y), \
    (__v8si)(__m256i)(Z), (int)(C), \
    (__mmask8)-1))

#define _mm256_mask_fixupimm_ps(X, U, Y, Z, C) \
  ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), \
    (__v8sf)(__m256)(Y), \
    (__v8si)(__m256i)(Z), (int)(C), \
    (__mmask8)(U)))

#define _mm256_maskz_fixupimm_ps(U, X, Y, Z, C) \
  ((__m256)__builtin_ia32_fixupimmps256_maskz ((__v8sf)(__m256)(X), \
    (__v8sf)(__m256)(Y), \
    (__v8si)(__m256i)(Z), (int)(C), \
    (__mmask8)(U)))

#define _mm_fixupimm_pd(X, Y, Z, C) \
  ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), \
    (__v2df)(__m128d)(Y), \
    (__v2di)(__m128i)(Z), (int)(C), \
    (__mmask8)-1))

#define _mm_mask_fixupimm_pd(X, U, Y, Z, C) \
  ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), \
    (__v2df)(__m128d)(Y), \
    (__v2di)(__m128i)(Z), (int)(C), \
    (__mmask8)(U)))

#define _mm_maskz_fixupimm_pd(U, X, Y, Z, C) \
  ((__m128d)__builtin_ia32_fixupimmpd128_maskz ((__v2df)(__m128d)(X), \
    (__v2df)(__m128d)(Y), \
    (__v2di)(__m128i)(Z), (int)(C), \
    (__mmask8)(U)))

#define _mm_fixupimm_ps(X, Y, Z, C) \
  ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), \
    (__v4sf)(__m128)(Y), \
    (__v4si)(__m128i)(Z), (int)(C), \
    (__mmask8)-1))

#define _mm_mask_fixupimm_ps(X, U, Y, Z, C) \
  ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), \
    (__v4sf)(__m128)(Y), \
    (__v4si)(__m128i)(Z), (int)(C), \
    (__mmask8)(U)))

#define _mm_maskz_fixupimm_ps(U, X, Y, Z, C) \
  ((__m128)__builtin_ia32_fixupimmps128_maskz ((__v4sf)(__m128)(X), \
    (__v4sf)(__m128)(Y), \
    (__v4si)(__m128i)(Z), (int)(C), \
    (__mmask8)(U)))
#define _mm256_mask_srli_epi32(W, U, A, B) \
  ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), \
    (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))

#define _mm256_maskz_srli_epi32(U, A, B) \
  ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), \
    (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))

#define _mm_mask_srli_epi32(W, U, A, B) \
  ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), \
    (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))

#define _mm_maskz_srli_epi32(U, A, B) \
  ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), \
    (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))

#define _mm256_mask_srli_epi64(W, U, A, B) \
  ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), \
    (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))

#define _mm256_maskz_srli_epi64(U, A, B) \
  ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), \
    (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))

#define _mm_mask_srli_epi64(W, U, A, B) \
  ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), \
    (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))

#define _mm_maskz_srli_epi64(U, A, B) \
  ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), \
    (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
#define _mm256_mask_slli_epi32(W, U, X, C) \
  ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C), \
    (__v8si)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_slli_epi32(U, X, C) \
  ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm256_mask_slli_epi64(W, U, X, C) \
  ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C), \
    (__v4di)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_slli_epi64(U, X, C) \
  ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C), \
    (__v4di)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm_mask_slli_epi32(W, U, X, C) \
  ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C), \
    (__v4si)(__m128i)(W), \
    (__mmask8)(U)))

#define _mm_maskz_slli_epi32(U, X, C) \
  ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C), \
    (__v4si)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)(U)))

#define _mm_mask_slli_epi64(W, U, X, C) \
  ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C), \
    (__v2di)(__m128i)(W), \
    (__mmask8)(U)))

#define _mm_maskz_slli_epi64(U, X, C) \
  ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C), \
    (__v2di)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)(U)))
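/* vpternlog: for each bit position, the immediate I is a 3-input truth
   table indexed by the corresponding bits of A, B and C; e.g. 0x96
   computes A ^ B ^ C and 0xE8 the bitwise majority of the three.  */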
#define _mm256_ternarylogic_epi64(A, B, C, I) \
  ((__m256i) \
   __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \
     (__v4di) (__m256i) (B), \
     (__v4di) (__m256i) (C), \
     (unsigned char) (I), \
     (__mmask8) -1))

#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) \
  ((__m256i) \
   __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \
     (__v4di) (__m256i) (B), \
     (__v4di) (__m256i) (C), \
     (unsigned char) (I), \
     (__mmask8) (U)))

#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) \
  ((__m256i) \
   __builtin_ia32_pternlogq256_maskz ((__v4di) (__m256i) (A), \
     (__v4di) (__m256i) (B), \
     (__v4di) (__m256i) (C), \
     (unsigned char) (I), \
     (__mmask8) (U)))

#define _mm256_ternarylogic_epi32(A, B, C, I) \
  ((__m256i) \
   __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \
     (__v8si) (__m256i) (B), \
     (__v8si) (__m256i) (C), \
     (unsigned char) (I), \
     (__mmask8) -1))

#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) \
  ((__m256i) \
   __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \
     (__v8si) (__m256i) (B), \
     (__v8si) (__m256i) (C), \
     (unsigned char) (I), \
     (__mmask8) (U)))

#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) \
  ((__m256i) \
   __builtin_ia32_pternlogd256_maskz ((__v8si) (__m256i) (A), \
     (__v8si) (__m256i) (B), \
     (__v8si) (__m256i) (C), \
     (unsigned char) (I), \
     (__mmask8) (U)))

#define _mm_ternarylogic_epi64(A, B, C, I) \
  ((__m128i) \
   __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \
     (__v2di) (__m128i) (B), \
     (__v2di) (__m128i) (C), \
     (unsigned char) (I), \
     (__mmask8) -1))

#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) \
  ((__m128i) \
   __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \
     (__v2di) (__m128i) (B), \
     (__v2di) (__m128i) (C), \
     (unsigned char) (I), \
     (__mmask8) (U)))

#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) \
  ((__m128i) \
   __builtin_ia32_pternlogq128_maskz ((__v2di) (__m128i) (A), \
     (__v2di) (__m128i) (B), \
     (__v2di) (__m128i) (C), \
     (unsigned char) (I), \
     (__mmask8) (U)))

#define _mm_ternarylogic_epi32(A, B, C, I) \
  ((__m128i) \
   __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \
     (__v4si) (__m128i) (B), \
     (__v4si) (__m128i) (C), \
     (unsigned char) (I), \
     (__mmask8) -1))

#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) \
  ((__m128i) \
   __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \
     (__v4si) (__m128i) (B), \
     (__v4si) (__m128i) (C), \
     (unsigned char) (I), \
     (__mmask8) (U)))

#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) \
  ((__m128i) \
   __builtin_ia32_pternlogd128_maskz ((__v4si) (__m128i) (A), \
     (__v4si) (__m128i) (B), \
     (__v4si) (__m128i) (C), \
     (unsigned char) (I), \
     (__mmask8) (U)))
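/* vrndscale: rounds each element to 2^-M precision, where M is
   imm8[7:4]; the low nibble selects the rounding mode and exception
   suppression.  */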
#define _mm256_roundscale_ps(A, B) \
  ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \
    (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1))

#define _mm256_mask_roundscale_ps(W, U, A, B) \
  ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \
    (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U)))

#define _mm256_maskz_roundscale_ps(U, A, B) \
  ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \
    (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U)))

#define _mm256_roundscale_pd(A, B) \
  ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), \
    (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1))

#define _mm256_mask_roundscale_pd(W, U, A, B) \
  ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), \
    (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))

#define _mm256_maskz_roundscale_pd(U, A, B) \
  ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), \
    (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U)))

#define _mm_roundscale_ps(A, B) \
  ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), \
    (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1))

#define _mm_mask_roundscale_ps(W, U, A, B) \
  ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), \
    (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U)))

#define _mm_maskz_roundscale_ps(U, A, B) \
  ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), \
    (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U)))

#define _mm_roundscale_pd(A, B) \
  ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), \
    (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)-1))

#define _mm_mask_roundscale_pd(W, U, A, B) \
  ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), \
    (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U)))

#define _mm_maskz_roundscale_pd(U, A, B) \
  ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), \
    (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U)))
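/* vgetmant: B picks the normalization interval and C the sign control;
   the two fields are packed into one immediate as ((C) << 2) | (B).  */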
#define _mm256_getmant_ps(X, B, C) \
  ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), \
    (int)(((C)<<2) | (B)), \
    (__v8sf)(__m256)_mm256_setzero_ps (), \
    (__mmask8)-1))

#define _mm256_mask_getmant_ps(W, U, X, B, C) \
  ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), \
    (int)(((C)<<2) | (B)), \
    (__v8sf)(__m256)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_getmant_ps(U, X, B, C) \
  ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), \
    (int)(((C)<<2) | (B)), \
    (__v8sf)(__m256)_mm256_setzero_ps (), \
    (__mmask8)(U)))

#define _mm_getmant_ps(X, B, C) \
  ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), \
    (int)(((C)<<2) | (B)), \
    (__v4sf)(__m128)_mm_setzero_ps (), \
    (__mmask8)-1))

#define _mm_mask_getmant_ps(W, U, X, B, C) \
  ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), \
    (int)(((C)<<2) | (B)), \
    (__v4sf)(__m128)(W), \
    (__mmask8)(U)))

#define _mm_maskz_getmant_ps(U, X, B, C) \
  ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), \
    (int)(((C)<<2) | (B)), \
    (__v4sf)(__m128)_mm_setzero_ps (), \
    (__mmask8)(U)))

#define _mm256_getmant_pd(X, B, C) \
  ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), \
    (int)(((C)<<2) | (B)), \
    (__v4df)(__m256d)_mm256_setzero_pd (), \
    (__mmask8)-1))

#define _mm256_mask_getmant_pd(W, U, X, B, C) \
  ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), \
    (int)(((C)<<2) | (B)), \
    (__v4df)(__m256d)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_getmant_pd(U, X, B, C) \
  ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), \
    (int)(((C)<<2) | (B)), \
    (__v4df)(__m256d)_mm256_setzero_pd (), \
    (__mmask8)(U)))

#define _mm_getmant_pd(X, B, C) \
  ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), \
    (int)(((C)<<2) | (B)), \
    (__v2df)(__m128d)_mm_setzero_pd (), \
    (__mmask8)-1))

#define _mm_mask_getmant_pd(W, U, X, B, C) \
  ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), \
    (int)(((C)<<2) | (B)), \
    (__v2df)(__m128d)(W), \
    (__mmask8)(U)))

#define _mm_maskz_getmant_pd(U, X, B, C) \
  ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), \
    (int)(((C)<<2) | (B)), \
    (__v2df)(__m128d)_mm_setzero_pd (), \
    (__mmask8)(U)))
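/* Masked gathers: for each set bit of MASK, load an element from
   ADDR + INDEX[i] * SCALE; lanes whose mask bit is clear keep the
   corresponding element of V1OLD.  SCALE must be 1, 2, 4 or 8.  */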
#define _mm256_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m256) __builtin_ia32_gather3siv8sf ((__v8sf)(__m256) (V1OLD), \
    (void const *) (ADDR), \
    (__v8si)(__m256i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128) __builtin_ia32_gather3siv4sf ((__v4sf)(__m128) (V1OLD), \
    (void const *) (ADDR), \
    (__v4si)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm256_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m256d) __builtin_ia32_gather3siv4df ((__v4df)(__m256d) (V1OLD), \
    (void const *) (ADDR), \
    (__v4si)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128d) __builtin_ia32_gather3siv2df ((__v2df)(__m128d) (V1OLD), \
    (void const *) (ADDR), \
    (__v4si)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm256_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128) __builtin_ia32_gather3div8sf ((__v4sf)(__m128) (V1OLD), \
    (void const *) (ADDR), \
    (__v4di)(__m256i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128) __builtin_ia32_gather3div4sf ((__v4sf)(__m128) (V1OLD), \
    (void const *) (ADDR), \
    (__v2di)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm256_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m256d) __builtin_ia32_gather3div4df ((__v4df)(__m256d) (V1OLD), \
    (void const *) (ADDR), \
    (__v4di)(__m256i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128d) __builtin_ia32_gather3div2df ((__v2df)(__m128d) (V1OLD), \
    (void const *) (ADDR), \
    (__v2di)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm256_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m256i) __builtin_ia32_gather3siv8si ((__v8si)(__m256i) (V1OLD), \
    (void const *) (ADDR), \
    (__v8si)(__m256i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128i) __builtin_ia32_gather3siv4si ((__v4si)(__m128i) (V1OLD), \
    (void const *) (ADDR), \
    (__v4si)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm256_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m256i) __builtin_ia32_gather3siv4di ((__v4di)(__m256i) (V1OLD), \
    (void const *) (ADDR), \
    (__v4si)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128i) __builtin_ia32_gather3siv2di ((__v2di)(__m128i) (V1OLD), \
    (void const *) (ADDR), \
    (__v4si)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm256_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128i) __builtin_ia32_gather3div8si ((__v4si)(__m128i) (V1OLD), \
    (void const *) (ADDR), \
    (__v4di)(__m256i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128i) __builtin_ia32_gather3div4si ((__v4si)(__m128i) (V1OLD), \
    (void const *) (ADDR), \
    (__v2di)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm256_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m256i) __builtin_ia32_gather3div4di ((__v4di)(__m256i) (V1OLD), \
    (void const *) (ADDR), \
    (__v4di)(__m256i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))

#define _mm_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
  (__m128i) __builtin_ia32_gather3div2di ((__v2di)(__m128i) (V1OLD), \
    (void const *) (ADDR), \
    (__v2di)(__m128i) (INDEX), \
    (__mmask8) (MASK), \
    (int) (SCALE))
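/* Scatters: store element i of V1 to ADDR + INDEX[i] * SCALE for each
   set mask bit; the unmasked forms pass (__mmask8)0xFF so every
   element is written.  */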
#define _mm256_i32scatter_ps(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8)0xFF, \
    (__v8si)(__m256i) (INDEX), \
    (__v8sf)(__m256) (V1), (int) (SCALE))

#define _mm256_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8) (MASK), \
    (__v8si)(__m256i) (INDEX), \
    (__v8sf)(__m256) (V1), (int) (SCALE))

#define _mm_i32scatter_ps(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4si)(__m128i) (INDEX), \
    (__v4sf)(__m128) (V1), (int) (SCALE))

#define _mm_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4si)(__m128i) (INDEX), \
    (__v4sf)(__m128) (V1), (int) (SCALE))

#define _mm256_i32scatter_pd(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4si)(__m128i) (INDEX), \
    (__v4df)(__m256d) (V1), (int) (SCALE))

#define _mm256_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4si)(__m128i) (INDEX), \
    (__v4df)(__m256d) (V1), (int) (SCALE))

#define _mm_i32scatter_pd(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4si)(__m128i) (INDEX), \
    (__v2df)(__m128d) (V1), (int) (SCALE))

#define _mm_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4si)(__m128i) (INDEX), \
    (__v2df)(__m128d) (V1), (int) (SCALE))

#define _mm256_i64scatter_ps(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4di)(__m256i) (INDEX), \
    (__v4sf)(__m128) (V1), (int) (SCALE))

#define _mm256_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4di)(__m256i) (INDEX), \
    (__v4sf)(__m128) (V1), (int) (SCALE))

#define _mm_i64scatter_ps(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8)0xFF, \
    (__v2di)(__m128i) (INDEX), \
    (__v4sf)(__m128) (V1), (int) (SCALE))

#define _mm_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8) (MASK), \
    (__v2di)(__m128i) (INDEX), \
    (__v4sf)(__m128) (V1), (int) (SCALE))

#define _mm256_i64scatter_pd(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4di)(__m256i) (INDEX), \
    (__v4df)(__m256d) (V1), (int) (SCALE))

#define _mm256_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4di)(__m256i) (INDEX), \
    (__v4df)(__m256d) (V1), (int) (SCALE))

#define _mm_i64scatter_pd(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8)0xFF, \
    (__v2di)(__m128i) (INDEX), \
    (__v2df)(__m128d) (V1), (int) (SCALE))

#define _mm_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8) (MASK), \
    (__v2di)(__m128i) (INDEX), \
    (__v2df)(__m128d) (V1), (int) (SCALE))

#define _mm256_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8)0xFF, \
    (__v8si)(__m256i) (INDEX), \
    (__v8si)(__m256i) (V1), (int) (SCALE))

#define _mm256_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8) (MASK), \
    (__v8si)(__m256i) (INDEX), \
    (__v8si)(__m256i) (V1), (int) (SCALE))

#define _mm_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4si)(__m128i) (INDEX), \
    (__v4si)(__m128i) (V1), (int) (SCALE))

#define _mm_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4si)(__m128i) (INDEX), \
    (__v4si)(__m128i) (V1), (int) (SCALE))

#define _mm256_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4si)(__m128i) (INDEX), \
    (__v4di)(__m256i) (V1), (int) (SCALE))

#define _mm256_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4si)(__m128i) (INDEX), \
    (__v4di)(__m256i) (V1), (int) (SCALE))

#define _mm_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4si)(__m128i) (INDEX), \
    (__v2di)(__m128i) (V1), (int) (SCALE))

#define _mm_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4si)(__m128i) (INDEX), \
    (__v2di)(__m128i) (V1), (int) (SCALE))

#define _mm256_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4di)(__m256i) (INDEX), \
    (__v4si)(__m128i) (V1), (int) (SCALE))

#define _mm256_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4di)(__m256i) (INDEX), \
    (__v4si)(__m128i) (V1), (int) (SCALE))

#define _mm_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8)0xFF, \
    (__v2di)(__m128i) (INDEX), \
    (__v4si)(__m128i) (V1), (int) (SCALE))

#define _mm_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8) (MASK), \
    (__v2di)(__m128i) (INDEX), \
    (__v4si)(__m128i) (V1), (int) (SCALE))

#define _mm256_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8)0xFF, \
    (__v4di)(__m256i) (INDEX), \
    (__v4di)(__m256i) (V1), (int) (SCALE))

#define _mm256_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8) (MASK), \
    (__v4di)(__m256i) (INDEX), \
    (__v4di)(__m256i) (V1), (int) (SCALE))

#define _mm_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8)0xFF, \
    (__v2di)(__m128i) (INDEX), \
    (__v2di)(__m128i) (V1), (int) (SCALE))

#define _mm_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
  __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8) (MASK), \
    (__v2di)(__m128i) (INDEX), \
    (__v2di)(__m128i) (V1), (int) (SCALE))
#define _mm256_mask_shuffle_epi32(W, U, X, C) \
  ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), \
    (__v8si)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_shuffle_epi32(U, X, C) \
  ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm_mask_shuffle_epi32(W, U, X, C) \
  ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), \
    (__v4si)(__m128i)(W), \
    (__mmask8)(U)))

#define _mm_maskz_shuffle_epi32(U, X, C) \
  ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), \
    (__v4si)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)(U)))

#define _mm256_rol_epi64(A, B) \
  ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), \
    (__v4di)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)-1))

#define _mm256_mask_rol_epi64(W, U, A, B) \
  ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), \
    (__v4di)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_rol_epi64(U, A, B) \
  ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), \
    (__v4di)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm_rol_epi64(A, B) \
  ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), \
    (__v2di)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)-1))

#define _mm_mask_rol_epi64(W, U, A, B) \
  ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), \
    (__v2di)(__m128i)(W), \
    (__mmask8)(U)))

#define _mm_maskz_rol_epi64(U, A, B) \
  ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), \
    (__v2di)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)(U)))

#define _mm256_ror_epi64(A, B) \
  ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), \
    (__v4di)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)-1))

#define _mm256_mask_ror_epi64(W, U, A, B) \
  ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), \
    (__v4di)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_ror_epi64(U, A, B) \
  ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), \
    (__v4di)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm_ror_epi64(A, B) \
  ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), \
    (__v2di)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)-1))

#define _mm_mask_ror_epi64(W, U, A, B) \
  ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), \
    (__v2di)(__m128i)(W), \
    (__mmask8)(U)))

#define _mm_maskz_ror_epi64(U, A, B) \
  ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), \
    (__v2di)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)(U)))

#define _mm256_rol_epi32(A, B) \
  ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)-1))

#define _mm256_mask_rol_epi32(W, U, A, B) \
  ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), \
    (__v8si)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_rol_epi32(U, A, B) \
  ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm_rol_epi32(A, B) \
  ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), \
    (__v4si)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)-1))

#define _mm_mask_rol_epi32(W, U, A, B) \
  ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), \
    (__v4si)(__m128i)(W), \
    (__mmask8)(U)))

#define _mm_maskz_rol_epi32(U, A, B) \
  ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), \
    (__v4si)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)(U)))

#define _mm256_ror_epi32(A, B) \
  ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)-1))

#define _mm256_mask_ror_epi32(W, U, A, B) \
  ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), \
    (__v8si)(__m256i)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_ror_epi32(U, A, B) \
  ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm_ror_epi32(A, B) \
  ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), \
    (__v4si)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)-1))

#define _mm_mask_ror_epi32(W, U, A, B) \
  ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), \
    (__v4si)(__m128i)(W), \
    (__mmask8)(U)))

#define _mm_maskz_ror_epi32(U, A, B) \
  ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), \
    (__v4si)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)(U)))
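/* valignd/valignq: concatenate X (high) and Y (low), shift the pair
   right by C elements, and keep the low half.  */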
#define _mm256_alignr_epi32(X, Y, C) \
  ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(X), (__mmask8)-1))

#define _mm256_mask_alignr_epi32(W, U, X, Y, C) \
  ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U)))

#define _mm256_maskz_alignr_epi32(U, X, Y, C) \
  ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm256_alignr_epi64(X, Y, C) \
  ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(X), (__mmask8)-1))

#define _mm256_mask_alignr_epi64(W, U, X, Y, C) \
  ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U)))

#define _mm256_maskz_alignr_epi64(U, X, Y, C) \
  ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (), \
    (__mmask8)(U)))

#define _mm_alignr_epi32(X, Y, C) \
  ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), \
    (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(X), (__mmask8)-1))

#define _mm_mask_alignr_epi32(W, U, X, Y, C) \
  ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), \
    (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U)))

#define _mm_maskz_alignr_epi32(U, X, Y, C) \
  ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), \
    (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)(U)))

#define _mm_alignr_epi64(X, Y, C) \
  ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \
    (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1))

#define _mm_mask_alignr_epi64(W, U, X, Y, C) \
  ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \
    (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U)))

#define _mm_maskz_alignr_epi64(U, X, Y, C) \
  ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \
    (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)_mm_setzero_si128 (), \
    (__mmask8)(U)))
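/* vcvtps2ph: I is the rounding-control immediate, e.g.
   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC.  */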
#define _mm_mask_cvtps_ph(W, U, A, I) \
  ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I), \
    (__v8hi)(__m128i) (W), (__mmask8) (U)))

#define _mm_maskz_cvtps_ph(U, A, I) \
  ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I), \
    (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))

#define _mm256_mask_cvtps_ph(W, U, A, I) \
  ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I), \
    (__v8hi)(__m128i) (W), (__mmask8) (U)))

#define _mm256_maskz_cvtps_ph(U, A, I) \
  ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I), \
    (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))

#define _mm256_mask_srai_epi32(W, U, A, B) \
  ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), \
    (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))

#define _mm256_maskz_srai_epi32(U, A, B) \
  ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), \
    (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))

#define _mm_mask_srai_epi32(W, U, A, B) \
  ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), \
    (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))

#define _mm_maskz_srai_epi32(U, A, B) \
  ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), \
    (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))

#define _mm256_srai_epi64(A, B) \
  ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \
    (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)-1))

#define _mm256_mask_srai_epi64(W, U, A, B) \
  ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \
    (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))

#define _mm256_maskz_srai_epi64(U, A, B) \
  ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \
    (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))

#define _mm_srai_epi64(A, B) \
  ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \
    (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)-1))

#define _mm_mask_srai_epi64(W, U, A, B) \
  ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \
    (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))

#define _mm_maskz_srai_epi64(U, A, B) \
  ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \
    (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))

#define _mm256_mask_permutex_pd(W, U, A, B) \
  ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), \
    (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))

#define _mm256_maskz_permutex_pd(U, A, B) \
  ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), \
    (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U)))

#define _mm256_mask_permute_pd(W, U, X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), \
    (__v4df)(__m256d)(W), \
    (__mmask8)(U)))

#define _mm256_maskz_permute_pd(U, X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), \
    (__v4df)(__m256d)_mm256_setzero_pd (), \
    (__mmask8)(U)))

#define _mm256_mask_permute_ps(W, U, X, C) \
  ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), \
    (__v8sf)(__m256)(W), (__mmask8)(U)))

#define _mm256_maskz_permute_ps(U, X, C) \
  ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), \
    (__v8sf)(__m256)_mm256_setzero_ps (), \
    (__mmask8)(U)))

#define _mm_mask_permute_pd(W, U, X, C) \
  ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), \
    (__v2df)(__m128d)(W), (__mmask8)(U)))

#define _mm_maskz_permute_pd(U, X, C) \
  ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), \
    (__v2df)(__m128d)_mm_setzero_pd (), \
    (__mmask8)(U)))

#define _mm_mask_permute_ps(W, U, X, C) \
  ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), \
    (__v4sf)(__m128)(W), (__mmask8)(U)))

#define _mm_maskz_permute_ps(U, X, C) \
  ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), \
    (__v4sf)(__m128)_mm_setzero_ps (), \
    (__mmask8)(U)))
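/* Masked blends: lane i of the result is __W[i] when bit i of __U is
   set and __A[i] otherwise.  */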
#define _mm256_mask_blend_pd(__U, __A, __W) \
  ((__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) (__A), \
    (__v4df) (__W), \
    (__mmask8) (__U)))

#define _mm256_mask_blend_ps(__U, __A, __W) \
  ((__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) (__A), \
    (__v8sf) (__W), \
    (__mmask8) (__U)))

#define _mm256_mask_blend_epi64(__U, __A, __W) \
  ((__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) (__A), \
    (__v4di) (__W), \
    (__mmask8) (__U)))

#define _mm256_mask_blend_epi32(__U, __A, __W) \
  ((__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) (__A), \
    (__v8si) (__W), \
    (__mmask8) (__U)))

#define _mm_mask_blend_pd(__U, __A, __W) \
  ((__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) (__A), \
    (__v2df) (__W), \
    (__mmask8) (__U)))

#define _mm_mask_blend_ps(__U, __A, __W) \
  ((__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) (__A), \
    (__v4sf) (__W), \
    (__mmask8) (__U)))

#define _mm_mask_blend_epi64(__U, __A, __W) \
  ((__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) (__A), \
    (__v2di) (__W), \
    (__mmask8) (__U)))

#define _mm_mask_blend_epi32(__U, __A, __W) \
  ((__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) (__A), \
    (__v4si) (__W), \
    (__mmask8) (__U)))
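/* Immediate compares: P is an _MM_CMPINT_* predicate for the integer
   forms or a _CMP_* predicate for the float forms; each lane yields
   one bit of the returned mask.  */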
#define _mm256_cmp_epu32_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm256_cmp_epi64_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm256_cmp_epi32_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm256_cmp_epu64_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm256_cmp_pd_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), \
    (__v4df)(__m256d)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm256_cmp_ps_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), \
    (__v8sf)(__m256)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm256_mask_cmp_epi64_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm256_mask_cmp_epi32_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm256_mask_cmp_epu64_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), \
    (__v4di)(__m256i)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm256_mask_cmp_epu32_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), \
    (__v8si)(__m256i)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm256_mask_cmp_pd_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), \
    (__v4df)(__m256d)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm256_mask_cmp_ps_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), \
    (__v8sf)(__m256)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm_cmp_epi64_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), \
    (__v2di)(__m128i)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm_cmp_epi32_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), \
    (__v4si)(__m128i)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm_cmp_epu64_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), \
    (__v2di)(__m128i)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm_cmp_epu32_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), \
    (__v4si)(__m128i)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm_cmp_pd_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), \
    (__v2df)(__m128d)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm_cmp_ps_mask(X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), \
    (__v4sf)(__m128)(Y), (int)(P), \
    (__mmask8)-1))

#define _mm_mask_cmp_epi64_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), \
    (__v2di)(__m128i)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm_mask_cmp_epi32_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), \
    (__v4si)(__m128i)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm_mask_cmp_epu64_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), \
    (__v2di)(__m128i)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm_mask_cmp_epu32_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), \
    (__v4si)(__m128i)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm_mask_cmp_pd_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), \
    (__v2df)(__m128d)(Y), (int)(P), \
    (__mmask8)(M)))

#define _mm_mask_cmp_ps_mask(M, X, Y, P) \
  ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), \
    (__v4sf)(__m128)(Y), (int)(P), \
    (__mmask8)(M)))
#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps ((B), (A))

#define _mm256_mask_cvt_roundps_ph(A, B, C, D) \
  _mm256_mask_cvtps_ph ((A), (B), (C), (D))
#define _mm256_maskz_cvt_roundps_ph(A, B, C) \
  _mm256_maskz_cvtps_ph ((A), (B), (C))
#define _mm_mask_cvt_roundps_ph(A, B, C, D) \
  _mm_mask_cvtps_ph ((A), (B), (C), (D))
#define _mm_maskz_cvt_roundps_ph(A, B, C) _mm_maskz_cvtps_ph ((A), (B), (C))
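/* Usage sketch (hypothetical operands): a merge-masked shift where
   lanes with a clear mask bit keep the corresponding element of src:
     __m256i r = _mm256_mask_slli_epi32 (src, (__mmask8) 0xAA, x, 3);  */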
#ifdef __DISABLE_AVX512VL__
#undef __DISABLE_AVX512VL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VL__ */

#endif /* _AVX512VLINTRIN_H_INCLUDED */